if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, broom, skimr, devtools, ggpubr, mgcv, extrafont, mgcViz, here)
theme_set(theme_pubclean(base_size = 14))
# load the function to print GAM figures
source(here("R", "p_gam.R"))
# import
lesion <- read_csv(here("cache", "summarised_lesion_data.csv"))
weather <- read_csv(here("cache", "weather_summary.csv"))
dat <- left_join(lesion, weather, by = c("site" = "Location", "rep" = "Rep"))
For reproducibility purposes, use set.seed().
set.seed(27)
mod1 <-
gam(
mean_pot_count ~ s(distance, k = 5),
data = dat
)
summary(mod1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08024 0.04751 22.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.926 3.996 78.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.482 Deviance explained = 48.8%
## GCV = 0.76522 Scale est. = 0.75394 n = 334
print(p_gam(x = getViz(mod1)) +
ggtitle("s(Distance)"),
pages = 1)
mod2 <-
gam(
mean_pot_count ~ sum_rain+ s(distance, k = 5),
data = dat
)
summary(mod2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.126435 0.055804 20.186 <2e-16 ***
## sum_rain -0.011494 0.007325 -1.569 0.118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.926 3.996 78.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.484 Deviance explained = 49.1%
## GCV = 0.76416 Scale est. = 0.7506 n = 334
print(p_gam(x = getViz(mod2)) +
ggtitle("s(Distance) + Precipitation"),
pages = 1)
mod3 <-
gam(mean_pot_count ~ mws + s(distance, k = 5),
data = dat)
summary(mod3)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ mws + s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.64089 0.11706 5.475 8.71e-08 ***
## mws 0.12598 0.03081 4.088 5.47e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.929 3.996 82.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.505 Deviance explained = 51.3%
## GCV = 0.73257 Scale est. = 0.71956 n = 334
print(p_gam(x = getViz(mod3)) +
ggtitle("s(Distance) + Windspeed"),
pages = 1)
mod4 <-
gam(mean_pot_count ~ sum_rain + mws + s(distance, k = 5),
data = dat)
summary(mod4)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + mws + s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.684697 0.119335 5.738 2.19e-08 ***
## sum_rain -0.012537 0.007154 -1.752 0.0806 .
## mws 0.127868 0.030735 4.160 4.07e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.93 3.996 82.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.508 Deviance explained = 51.7%
## GCV = 0.7302 Scale est. = 0.71505 n = 334
print(p_gam(x = getViz(mod4)) +
ggtitle("s(Distance) + Windspeed + Precipitation"),
pages = 1)
mod5 <-
gam(
mean_pot_count ~ sum_rain + s(distance + mws, k = 5),
data = dat
)
## Warning in term[i] <- attr(terms(reformulate(term[i])), "term.labels"): number
## of items to replace is not a multiple of replacement length
summary(mod5)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + s(distance + mws, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.126435 0.055804 20.186 <2e-16 ***
## sum_rain -0.011494 0.007325 -1.569 0.118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.926 3.996 78.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.484 Deviance explained = 49.1%
## GCV = 0.76416 Scale est. = 0.7506 n = 334
print(p_gam(x = getViz(mod5)) +
ggtitle("s(Distance + Windspeed) + Precipitation"),
pages = 1)
mod6 <-
gam(
mean_pot_count ~ sum_rain + s(distance, k = 5) + s(mws, k = 5),
data = dat
)
summary(mod6)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + s(distance, k = 5) + s(mws, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.026368 0.056285 18.235 <2e-16 ***
## sum_rain 0.013404 0.008903 1.506 0.133
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.938 3.997 93.81 < 2e-16 ***
## s(mws) 3.929 3.996 16.11 2.89e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.566 Deviance explained = 57.8%
## GCV = 0.6497 Scale est. = 0.63051 n = 334
print(p_gam(x = getViz(mod6)) +
ggtitle("s(Distance) + s(Windspeed) + Precipitation"),
pages = 1)
mod7 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5),
data = dat
)
summary(mod7)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08024 0.04353 24.82 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.938 3.997 93.42 < 2e-16 ***
## s(mws) 3.932 3.997 16.47 2.26e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.565 Deviance explained = 57.5%
## GCV = 0.6502 Scale est. = 0.63293 n = 334
print(p_gam(x = getViz(mod7)) +
ggtitle("s(Distance) + s(Windspeed)"),
pages = 1)
mod8 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain, k = 5),
data = dat
)
summary(mod8)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08024 0.04345 24.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.938 3.997 93.803 < 2e-16 ***
## s(mws) 3.029 3.101 9.215 2.97e-06 ***
## s(sum_rain) 1.876 1.892 2.435 0.154
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.566 Deviance explained = 57.8%
## GCV = 0.64966 Scale est. = 0.63051 n = 334
print(p_gam(x = getViz(mod8)) +
ggtitle("s(Distance) + s(Windspeed) + s(Precipitation)"),
pages = 1)
mod9 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5),
data = dat
)
summary(mod9)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08024 0.04594 23.51 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.931 3.996 83.82 < 2e-16 ***
## s(sum_rain) 2.034 2.205 10.73 2.36e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.515 Deviance explained = 52.4%
## GCV = 0.71997 Scale est. = 0.70495 n = 334
print(p_gam(x = getViz(mod9)) +
ggtitle("s(Distance) + s(Precipitation)"),
pages = 1)
mod10 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5) + mws,
data = dat
)
summary(mod10)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5) + mws
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.7875 1.4359 -1.941 0.05308 .
## mws 1.1090 0.4115 2.695 0.00741 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.936 3.997 93.81 < 2e-16 ***
## s(sum_rain) 3.819 3.967 13.49 1.1e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.566 Deviance explained = 57.8%
## GCV = 0.64947 Scale est. = 0.6305 n = 334
print(p_gam(x = getViz(mod10)) +
ggtitle("s(Distance) + s(Precipitation) + Windspeed"),
pages = 1)
This is the same as mod8 but using family = tw(), see ?family.mgcv for more on the families. The Tweedie distribution is used where the distribution has a positive mass at zero, but is continuous unlike the Poisson distribution that requires count data. The data visualisation shows clearly that the mean pot count data have this shape.
mod11.0 <-
gam(
mean_pot_count ~ s(distance, k = 5) +
s(mws, k = 5) +
s(sum_rain, k = 5),
data = dat,
family = tw()
)
summary(mod11.0)
##
## Family: Tweedie(p=1.044)
## Link function: log
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.22836 0.04099 -5.571 5.32e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.497 3.855 123.762 < 2e-16 ***
## s(mws) 2.937 3.053 14.119 4.58e-09 ***
## s(sum_rain) 1.901 1.929 7.169 0.00507 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.673 Deviance explained = 61.2%
## -REML = 309.71 Scale est. = 0.36396 n = 334
print(p_gam(x = getViz(mod11.0)) +
ggtitle("s(Distance) + s(Windspeed) + s(Precipitation), family = tw()"),
pages = 1)
mod11.1 <-
gam(
mean_pot_count ~ s(distance, k = 5, bs = "ts") +
s(mws, k = 5, bs = "ts") +
s(sum_rain, k = 5, bs = "ts"),
data = dat,
family = tw()
)
summary(mod11.1)
##
## Family: Tweedie(p=1.044)
## Link function: log
##
## Formula:
## mean_pot_count ~ s(distance, k = 5, bs = "ts") + s(mws, k = 5,
## bs = "ts") + s(sum_rain, k = 5, bs = "ts")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2256 0.0409 -5.517 7.03e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.2506 4 117.946 <2e-16 ***
## s(mws) 3.8535 4 23.589 <2e-16 ***
## s(sum_rain) 0.7535 3 1.015 0.0449 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.671 Deviance explained = 61%
## -REML = 316.95 Scale est. = 0.36407 n = 334
print(
p_gam(x = getViz(mod11.1)) +
ggtitle(
"s(Distance, bs = 'ts') + s(Windspeed, bs = 'ts')\n+ s(Precipitation, bs = 'ts'), family = tw()"
),
pages = 1
)
This model, same structure as mod11.0, uses thin-plate splines to shrink the coefficients of the smooth to zero.
models <- list(mod1 = mod1,
mod2 = mod2,
mod3 = mod3,
mod4 = mod4,
mod5 = mod5,
mod6 = mod6,
mod7 = mod7,
mod8 = mod8,
mod9 = mod9,
mod10 = mod10,
mod11.0 = mod11.0,
mod11.1 = mod11.1
)
map_df(models, glance, .id = "model") %>%
arrange(AIC)
## # A tibble: 12 x 7
## model df logLik AIC BIC deviance df.residual
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 mod11.0 9.34 -320. 663. 708. 141. 325.
## 2 mod11.1 8.86 -321. 666. 711. 141. 325.
## 3 mod10 9.76 -392. 805. 846. 204. 324.
## 4 mod8 9.84 -392. 805. 847. 204. 324.
## 5 mod6 9.87 -392. 806. 847. 204. 324.
## 6 mod7 8.87 -393. 806. 843. 206. 325.
## 7 mod9 6.96 -412. 840. 870. 231. 327.
## 8 mod4 6.93 -414. 845. 875. 234. 327.
## 9 mod3 5.93 -416. 846. 872. 236. 328.
## 10 mod2 5.93 -423. 860. 886. 246. 328.
## 11 mod5 5.93 -423. 860. 886. 246. 328.
## 12 mod1 4.93 -424. 860. 883. 248. 329.
enframe(c(
mod1 = summary(mod1)$r.sq,
mod2 = summary(mod2)$r.sq,
mod3 = summary(mod3)$r.sq,
mod4 = summary(mod4)$r.sq,
mod5 = summary(mod5)$r.sq,
mod6 = summary(mod6)$r.sq,
mod7 = summary(mod7)$r.sq,
mod8 = summary(mod8)$r.sq,
mod9 = summary(mod9)$r.sq,
mod10 = summary(mod10)$r.sq,
mod11.0 = summary(mod11.0)$r.sq,
mod11.1 = summary(mod11.1)$r.sq
)) %>%
arrange(desc(value))
## # A tibble: 12 x 2
## name value
## <chr> <dbl>
## 1 mod11.0 0.673
## 2 mod11.1 0.671
## 3 mod10 0.566
## 4 mod6 0.566
## 5 mod8 0.566
## 6 mod7 0.565
## 7 mod9 0.515
## 8 mod4 0.508
## 9 mod3 0.505
## 10 mod2 0.484
## 11 mod5 0.484
## 12 mod1 0.482
anova(mod1,
mod2,
mod3,
mod4,
mod5,
mod6,
mod7,
mod8,
mod9,
mod10,
mod11.0,
mod11.1,
test = "F")
## Analysis of Deviance Table
##
## Model 1: mean_pot_count ~ s(distance, k = 5)
## Model 2: mean_pot_count ~ sum_rain + s(distance, k = 5)
## Model 3: mean_pot_count ~ mws + s(distance, k = 5)
## Model 4: mean_pot_count ~ sum_rain + mws + s(distance, k = 5)
## Model 5: mean_pot_count ~ sum_rain + s(distance + mws, k = 5)
## Model 6: mean_pot_count ~ sum_rain + s(distance, k = 5) + s(mws, k = 5)
## Model 7: mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5)
## Model 8: mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
## Model 9: mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5)
## Model 10: mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5) + mws
## Model 11: mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
## Model 12: mean_pot_count ~ s(distance, k = 5, bs = "ts") + s(mws, k = 5,
## bs = "ts") + s(sum_rain, k = 5, bs = "ts")
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 329.00 248.10
## 2 328.00 246.25 1.00004796 1.85 5.0793 0.02488 *
## 3 328.00 236.07 0.00033734 10.18 82920.5912 1.835e-11 ***
## 4 327.00 233.87 1.00005877 2.20 6.0360 0.01454 *
## 5 328.00 246.25 -1.00039611 -12.38 33.9953 1.330e-08 ***
## 6 324.01 204.37 3.99764373 41.88 28.7764 < 2.2e-16 ***
## 7 325.01 205.79 -0.99979946 -1.42 3.8900 0.04943 *
## 8 324.01 204.39 0.99651105 1.40 3.8610 0.05042 .
## 9 326.80 230.55 -2.78870751 -26.16 25.7666 3.509e-14 ***
## 10 324.04 204.44 2.76263340 26.11 25.9575 3.555e-14 ***
## 11 323.66 639.48 0.37405937 -435.04
## 12 323.61 642.99 0.04695074 -3.51
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod11.0_vis <- getViz(mod11.0)
check(mod11.0_vis,
a.qq = list(method = "tnorm",
a.cipoly = list(fill = "light blue")),
a.respoi = list(size = 0.5),
a.hist = list(bins = 10))
##
## Method: REML Optimizer: outer newton
## full convergence after 8 iterations.
## Gradient range [-5.35515e-06,9.353029e-08]
## (score 309.7133 & scale 0.363963).
## Hessian positive definite, eigenvalue range [0.394613,2978.459].
## Model rank = 13 / 13
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(distance) 4.00 3.50 0.86 0.010 **
## s(mws) 4.00 2.94 0.89 0.045 *
## s(sum_rain) 4.00 1.90 0.90 0.050 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod11.1_vis <- getViz(mod11.1)
check(mod11.1_vis,
a.qq = list(method = "tnorm",
a.cipoly = list(fill = "light blue")),
a.respoi = list(size = 0.5),
a.hist = list(bins = 10))
##
## Method: REML Optimizer: outer newton
## full convergence after 11 iterations.
## Gradient range [-5.96296e-09,2.343652e-10]
## (score 316.948 & scale 0.3640653).
## Hessian positive definite, eigenvalue range [0.2822617,2979.48].
## Model rank = 13 / 13
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(distance) 4.000 3.251 0.86 0.01 **
## s(mws) 4.000 3.853 0.89 0.02 *
## s(sum_rain) 4.000 0.753 0.89 0.02 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This model, mod11.0, mean_pot_count ~ s(Distance) + s(Windspeed) + s(Precipitation) - family = tw(), lacks enough numbers of basis functions for two of three predictors, distance, wsp, but it’s close. Realistically we need to increase the k values for these predictors, but we’re lacking the data to do this.